<!DOCTYPE html>












  


<html class="theme-next pisces use-motion" lang="zh-CN">
<head><meta name="generator" content="Hexo 3.9.0">
  <meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=2">
<meta name="theme-color" content="#222">
























<link rel="stylesheet" href="/lib/font-awesome/css/font-awesome.min.css?v=4.6.2">

<link rel="stylesheet" href="/css/main.css?v=7.1.2">


  <link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon-next.png?v=7.1.2">


  <link rel="icon" type="image/png" sizes="32x32" href="/images/favicon-32x32-next.png?v=7.1.2">


  <link rel="icon" type="image/png" sizes="16x16" href="/images/favicon-16x16-next.png?v=7.1.2">


  <link rel="mask-icon" href="/images/logo.svg?v=7.1.2" color="#222">







<script id="hexo.configurations">
  // Theme namespace: reuse window.NexT when it already exists, otherwise start empty.
  var NexT = window.NexT || {};

  // Global settings object read by the theme scripts loaded later in the page.
  var CONFIG = {
    root: '/',
    scheme: 'Pisces',
    version: '7.1.2',

    // Sidebar placement and display options.
    sidebar: {
      "position": "left",
      "display": "post",
      "offset": 12,
      "onmobile": false,
      "dimmer": false
    },

    // Feature toggles.
    back2top: true,
    back2top_sidebar: false,
    fancybox: false,
    fastclick: false,
    lazyload: false,
    tabs: true,

    // Animation settings: one transition name per page region.
    motion: {
      "enable": true,
      "async": false,
      "transition": {
        "post_block": "fadeIn",
        "post_header": "slideDownIn",
        "post_body": "slideDownIn",
        "coll_header": "slideLeftIn",
        "sidebar": "slideUpIn"
      }
    },

    // Algolia search settings; the credentials are empty strings on this page.
    algolia: {
      applicationID: '',
      apiKey: '',
      indexName: '',
      hits: {
        "per_page": 10
      },
      labels: {
        "input_placeholder": "Search for Posts",
        "hits_empty": "We didn't find any results for the search: ${query}",
        "hits_stats": "${hits} results found in ${time} ms"
      }
    }
  };
</script>


  




  <meta name="description" content="Python内置的urllib模块，用于访问网络资源。但是，它用起来比较麻烦，而且，缺少很多实用的高级功能。它是一个Python第三方库，处理URL资源特别方便。">
<meta name="keywords" content="爬虫">
<meta property="og:type" content="article">
<meta property="og:title" content="爬虫-requests模块">
<meta property="og:url" content="https://rencheng.cc/2018/08/08/crawler/requests/index.html">
<meta property="og:site_name" content="PHOENIX&#39;s Blog">
<meta property="og:description" content="Python内置的urllib模块，用于访问网络资源。但是，它用起来比较麻烦，而且，缺少很多实用的高级功能。它是一个Python第三方库，处理URL资源特别方便。">
<meta property="og:locale" content="zh-CN">
<meta property="og:updated_time" content="2018-08-08T11:27:33.000Z">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="爬虫-requests模块">
<meta name="twitter:description" content="Python内置的urllib模块，用于访问网络资源。但是，它用起来比较麻烦，而且，缺少很多实用的高级功能。它是一个Python第三方库，处理URL资源特别方便。">



  <link rel="alternate" href="/atom.xml" title="PHOENIX's Blog" type="application/atom+xml">



  
  
  <link rel="canonical" href="https://rencheng.cc/2018/08/08/crawler/requests/">



<script id="page.configurations">
  // Per-page settings attached to the global CONFIG; sidebar is an empty string on this page.
  CONFIG.page = { sidebar: "" };
</script>

  <title>爬虫-requests模块 | PHOENIX's Blog</title>
  












  <noscript>
  <style>
  .use-motion .motion-element,
  .use-motion .brand,
  .use-motion .menu-item,
  .sidebar-inner,
  .use-motion .post-block,
  .use-motion .pagination,
  .use-motion .comments,
  .use-motion .post-header,
  .use-motion .post-body,
  .use-motion .collection-title { opacity: initial; }

  .use-motion .logo,
  .use-motion .site-title,
  .use-motion .site-subtitle {
    opacity: initial;
    top: initial;
  }

  .use-motion .logo-line-before i { left: initial; }
  .use-motion .logo-line-after i { right: initial; }
  </style>
</noscript>

</head>

<body itemscope itemtype="http://schema.org/WebPage" lang="zh-CN">

  
  
    
  

  <div class="container sidebar-position-left page-post-detail">
    <div class="headband"></div>

    <header id="header" class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-wrapper">
  <div class="site-meta">
    

    <div class="custom-logo-site-title">
      <a href="/" class="brand" rel="start">
        <span class="logo-line-before"><i></i></span>
        <span class="site-title">PHOENIX's Blog</span>
        <span class="logo-line-after"><i></i></span>
      </a>
    </div>
    
      
        <h1 class="site-subtitle" itemprop="description">Enjoy the journey of life</h1>
      
    
    
  </div>

  <div class="site-nav-toggle">
    <button aria-label="切换导航栏">
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
    </button>
  </div>
</div>



<nav class="site-nav">
  
    <ul id="menu" class="menu">
      
        
        
        
          
          <li class="menu-item menu-item-home">

    
    
    
      
    

    

    <a href="/" rel="section"><i class="menu-item-icon fa fa-fw fa-home"></i> <br>首页</a>

  </li>
        
        
        
          
          <li class="menu-item menu-item-tags">

    
    
    
      
    

    

    <a href="/tags/" rel="section"><i class="menu-item-icon fa fa-fw fa-tags"></i> <br>标签</a>

  </li>
        
        
        
          
          <li class="menu-item menu-item-categories">

    
    
    
      
    

    

    <a href="/categories/" rel="section"><i class="menu-item-icon fa fa-fw fa-th"></i> <br>分类</a>

  </li>
        
        
        
          
          <li class="menu-item menu-item-archives">

    
    
    
      
    

    

    <a href="/archives/" rel="section"><i class="menu-item-icon fa fa-fw fa-archive"></i> <br>归档</a>

  </li>
        
        
        
          
          <li class="menu-item menu-item-about">

    
    
    
      
    

    

    <a href="/about/" rel="section"><i class="menu-item-icon fa fa-fw fa-user"></i> <br>关于</a>

  </li>

      
      
        <li class="menu-item menu-item-search">
          
            <a href="javascript:;" class="popup-trigger">
          
            
              <i class="menu-item-icon fa fa-search fa-fw"></i> <br>搜索</a>
        </li>
      
    </ul>
  

  

  
    <div class="site-search">
      
  <!-- Local search popup: header (icon, close control, query input) followed by the results container. -->
  <div class="popup search-popup local-search-popup">
  <div class="local-search-header clearfix">
    <span class="search-icon">
      <!-- Decorative icon: hidden from assistive technology. -->
      <i class="fa fa-search" aria-hidden="true"></i>
    </span>
    <span class="popup-btn-close">
      <i class="fa fa-times-circle"></i>
    </span>
    <div class="local-search-input-wrapper">
      <!-- A placeholder is not an accessible name, so the input is labelled explicitly. -->
      <input autocomplete="off" placeholder="搜索..." spellcheck="false" type="text" id="local-search-input" aria-label="搜索">
    </div>
  </div>
  <div id="local-search-result"></div>
</div>



    </div>
  
</nav>



  



</div>
    </header>

    


    <main id="main" class="main">
      <div class="main-inner">
        <div class="content-wrap">
          
            

          
          <div id="content" class="content">
            

  <div id="posts" class="posts-expand">
    

  

  
  
  

  

  <article class="post post-type-normal" itemscope itemtype="http://schema.org/Article">
  
  
  
  <div class="post-block">
    <link itemprop="mainEntityOfPage" href="https://rencheng.cc/2018/08/08/crawler/requests/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="name" content="任成">
      <meta itemprop="description" content>
      <meta itemprop="image" content="/images/avatar.JPG">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="PHOENIX's Blog">
    </span>

    
      <header class="post-header">

        
        
          <h2 class="post-title" itemprop="name headline">爬虫-requests模块

              
            
          </h2>
        

        <div class="post-meta">
          <span class="post-time">

            
            
            

            
              <span class="post-meta-item-icon">
                <i class="fa fa-calendar-o"></i>
              </span>
              
                <span class="post-meta-item-text">发表于</span>
              

              
                
              

              <time title="创建时间：2018-08-08 19:27:33" itemprop="dateCreated datePublished" datetime="2018-08-08T19:27:33+08:00">2018-08-08</time>
            

            
          </span>

          
            <span class="post-category">
            
              <span class="post-meta-divider">|</span>
            
              <span class="post-meta-item-icon">
                <i class="fa fa-folder-o"></i>
              </span>
              
                <span class="post-meta-item-text">分类于</span>
              
              
                <span itemprop="about" itemscope itemtype="http://schema.org/Thing"><a href="/categories/教程/" itemprop="url" rel="index"><span itemprop="name">教程</span></a></span>

                
                
              
            </span>
          

          
            
            
          

          
          

          

          
            <div class="post-symbolscount">
              

              
                <span class="post-meta-item-icon">
                  <i class="fa fa-file-word-o"></i>
                </span>
                
                <span title="本文字数">3.1k</span>
              

              

              
            </div>
          

          

        </div>
      </header>
    

    
    
    
    <div class="post-body" itemprop="articleBody">

      
      

      
        <p>Python内置的urllib模块，用于访问网络资源。但是，它用起来比较麻烦，而且，缺少很多实用的高级功能。<br>requests是一个Python第三方库，处理URL资源特别方便。<br><a id="more"></a></p>
<h1 id="简单实用"><a href="#简单实用" class="headerlink" title="简单实用"></a>简单实用</h1><ol>
<li><p>res = get(url, params=params, headers=headers)</p>
 <figure class="highlight crmsh"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line">请求：</span><br><span class="line">会自动对<span class="keyword">params</span>进行编码,并和前面url进行拼接</span><br></pre></td></tr></table></figure>
 <figure class="highlight applescript"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre></td><td class="code"><pre><span class="line">参数：</span><br><span class="line">url： 请求地址</span><br><span class="line">params：请求参数</span><br><span class="line">headers：请求头部</span><br><span class="line"><span class="keyword">timeout</span>: 超时时间</span><br><span class="line">proxies：代理参数【见附录<span class="number">1</span>】</span><br></pre></td></tr></table></figure>
 <figure class="highlight lsl"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre></td><td class="code"><pre><span class="line">响应对象res的属性</span><br><span class="line"><span class="number">1</span>、encoding ：指定响应编码, res.encoding = <span class="string">"utf-8"</span></span><br><span class="line"><span class="number">2</span>、text     ：字符串</span><br><span class="line"><span class="number">3</span>、content  ：字节流</span><br><span class="line"><span class="number">4</span>、status_code ：HTTP响应码</span><br><span class="line"><span class="number">5</span>、url         ：返回实际数据的URL地址</span><br></pre></td></tr></table></figure>
</li>
<li><p>非结构化数据保存</p>
 <figure class="highlight py"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line">html = res.content</span><br><span class="line"><span class="keyword">with</span> open(<span class="string">"xxx"</span>,<span class="string">"wb"</span>) <span class="keyword">as</span> f:</span><br><span class="line">    f.write(html)</span><br></pre></td></tr></table></figure>
</li>
<li><p>post(url, data=data, headers=headers)</p>
 <figure class="highlight fortran"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">data</span> ：<span class="keyword">Form</span>表单数据,字典,不用编码,不用转码</span><br></pre></td></tr></table></figure>
</li>
</ol>
<h1 id="进阶指南"><a href="#进阶指南" class="headerlink" title="进阶指南"></a>进阶指南</h1><ol>
<li><p>requests的方便之处还在于，对于特定类型的响应，例如JSON，可以直接获取：</p>
 <figure class="highlight py"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line">r = requests.get(*******)</span><br><span class="line">r.json()</span><br></pre></td></tr></table></figure>
</li>
<li><p>requests默认使用<code>application/x-www-form-urlencoded</code>对POST数据编码。如果要传递JSON数据，可以直接传入json参数：</p>
 <figure class="highlight py"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line">params = &#123;<span class="string">'key'</span>: <span class="string">'value'</span>&#125;</span><br><span class="line">r = requests.post(url, json=params) <span class="comment"># 内部自动序列化为JSON</span></span><br></pre></td></tr></table></figure>
</li>
<li><p>类似的，上传文件需要更复杂的编码格式，但是requests把它简化成files参数：</p>
 <figure class="highlight py"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line">upload_files = &#123;<span class="string">'file'</span>: open(<span class="string">'report.xls'</span>, <span class="string">'rb'</span>)&#125;</span><br><span class="line">r = requests.post(url, files=upload_files)</span><br></pre></td></tr></table></figure>
<blockquote>
<p>在读取文件时，注意务必使用<code>'rb'</code>即二进制模式读取，这样获取的bytes长度才是文件的长度。</p>
</blockquote>
</li>
<li><p>除了能轻松获取响应内容外，requests对获取HTTP响应的其他信息也非常简单。例如，获取响应头：</p>
 <figure class="highlight ruby"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line"><span class="meta">&gt;&gt;</span>&gt; r.headers</span><br><span class="line">&#123;<span class="string">'Content-Type'</span>: <span class="string">'text/html; charset=utf-8'</span>, <span class="string">'Transfer-Encoding'</span>: <span class="string">'chunked'</span>, <span class="string">'Content-Encoding'</span>: <span class="string">'gzip'</span>, ...&#125;</span><br></pre></td></tr></table></figure>
</li>
</ol>
<h1 id="有道翻译案例"><a href="#有道翻译案例" class="headerlink" title="有道翻译案例"></a>有道翻译案例</h1><figure class="highlight py"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> requests</span><br><span class="line"><span class="keyword">import</span> json</span><br><span class="line"></span><br><span class="line"><span class="comment"># 得到data并把它转为bytes</span></span><br><span class="line">key = input(<span class="string">"请输入要翻译的内容:"</span>)</span><br><span class="line"><span class="comment"># 1. 
把Form表单数据定义为字典,F12-&gt;Form Data</span></span><br><span class="line">data = &#123;</span><br><span class="line">        <span class="string">"i"</span>:key,</span><br><span class="line">        <span class="string">"from"</span>:<span class="string">"AUTO"</span>,</span><br><span class="line">        <span class="string">"to"</span>:<span class="string">"AUTO"</span>,</span><br><span class="line">        <span class="string">"smartresult"</span>:<span class="string">"dict"</span>,</span><br><span class="line">        <span class="string">"client"</span>:<span class="string">"fanyideskweb"</span>,</span><br><span class="line">        <span class="string">"salt"</span>:<span class="string">"15437179202229"</span>,</span><br><span class="line">        <span class="string">"sign"</span>:<span class="string">"b5fee8d2268e22191d3e03ea884d5666"</span>,</span><br><span class="line">        <span class="string">"doctype"</span>:<span class="string">"json"</span>,</span><br><span class="line">        <span class="string">"version"</span>:<span class="string">"2.1"</span>,</span><br><span class="line">        <span class="string">"keyfrom"</span>:<span class="string">"fanyi.web"</span>,</span><br><span class="line">        <span class="string">"action"</span>:<span class="string">"FY_BY_REALTIME"</span>,</span><br><span class="line">        <span class="string">"typoResult"</span>:<span class="string">"false"</span></span><br><span class="line">    &#125;</span><br><span class="line"></span><br><span class="line"><span class="comment"># 发请求获响应</span></span><br><span class="line"><span class="comment"># url为抓包抓到的POST的地址,去掉translate_o中的 _o</span></span><br><span class="line">url = <span class="string">"http://fanyi.youdao.com/translate?smartresult=dict&amp;smartresult=rule"</span></span><br><span class="line">headers = &#123;<span class="string">"User-Agent"</span>:<span class="string">"Mozilla/5.0"</span>&#125;</span><br><span class="line"><span class="comment"># 
用requests模块的post方法,data参数为Form表单数据,必须为字典</span></span><br><span class="line">res = requests.post(url,data=data,headers=headers)</span><br><span class="line">res.encoding = <span class="string">"utf-8"</span></span><br><span class="line">html = res.text</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="comment"># 把json格式的字符串转为Python中的字典</span></span><br><span class="line">r_dict = json.loads(html)</span><br><span class="line">r = r_dict[<span class="string">"translateResult"</span>][<span class="number">0</span>][<span class="number">0</span>][<span class="string">"tgt"</span>]</span><br><span class="line">print(r)</span><br></pre></td></tr></table></figure>
<h1 id="附录一"><a href="#附录一" class="headerlink" title="附录一"></a>附录一</h1><h2 id="查询本机公网IP"><a href="#查询本机公网IP" class="headerlink" title="查询本机公网IP"></a>查询本机公网IP</h2><ol>
<li>百度搜索<code>IP</code></li>
<li>请求地址：<span class="exturl" data-url="aHR0cDovL2h0dHBiaW4ub3JnL2dldA==" title="http://httpbin.org/get">http://httpbin.org/get<i class="fa fa-external-link"></i></span></li>
</ol>
<h2 id="代理参数-：proxies-—-gt-字典"><a href="#代理参数-：proxies-—-gt-字典" class="headerlink" title="代理参数 ：proxies —&gt; 字典"></a>代理参数 ：proxies —&gt; 字典</h2><ol>
<li><p>获取代理IP的网站</p>
<ol>
<li><span class="exturl" data-url="aHR0cHM6Ly93d3cueGljaWRhaWxpLmNvbS9ubg==" title="https://www.xicidaili.com/nn">西刺代理<i class="fa fa-external-link"></i></span></li>
<li><span class="exturl" data-url="aHR0cHM6Ly93d3cua3VhaWRhaWxpLmNvbS9mcmVlL2luaGEv" title="https://www.kuaidaili.com/free/inha/">快代理<i class="fa fa-external-link"></i></span></li>
<li><span class="exturl" data-url="aHR0cDovL3d3dy5nb3ViYW5qaWEuY29tLw==" title="http://www.goubanjia.com/">全网代理<i class="fa fa-external-link"></i></span></li>
</ol>
</li>
<li><p>普通代理 ：字典<br>proxies = {"协议":"协议://IP地址:端口号"}</p>
 <figure class="highlight py"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line">proxies = &#123;<span class="string">"http"</span>:<span class="string">"http://183.129.207.82:11328"</span>&#125;</span><br><span class="line">res = requests.get(url,proxies=proxies,headers..)</span><br></pre></td></tr></table></figure>
</li>
<li><p>私密代理 ：字典<br>proxies = {"协议":"协议://用户名:密码@IP:端口"}</p>
 <figure class="highlight py"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">proxies = &#123;<span class="string">"http"</span>:<span class="string">"http://309435365:szayclhp@116.255.162.107:16816"</span> &#125;</span><br></pre></td></tr></table></figure>
</li>
</ol>
<h2 id="示例"><a href="#示例" class="headerlink" title="示例"></a>示例</h2><h3 id="普通代理"><a href="#普通代理" class="headerlink" title="普通代理"></a>普通代理</h3><figure class="highlight py"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> requests</span><br><span class="line"><span class="keyword">import</span> random</span><br><span class="line"></span><br><span class="line">url = <span class="string">"http://httpbin.org/get"</span></span><br><span class="line"><span class="comment">#url = "http://www.baidu.com/"</span></span><br><span class="line">headers = &#123;<span class="string">"User-Agent"</span>:<span class="string">"Mozilla/5.0"</span>&#125;</span><br><span class="line"></span><br><span class="line"><span class="comment"># IP代理池</span></span><br><span class="line">proxyList = [</span><br><span class="line">        &#123;<span class="string">""</span>:<span class="string">""</span>&#125;,</span><br><span class="line">        &#123;<span class="string">""</span>:<span class="string">""</span>&#125;,</span><br><span class="line">        &#123;<span class="string">""</span>:<span class="string">""</span>&#125;,</span><br><span class="line">    ]</span><br><span class="line">proxies = random.choice(proxyList)</span><br><span class="line"></span><br><span class="line">res = 
requests.get(url,proxies=proxies,</span><br><span class="line">                   headers=headers)</span><br><span class="line">res.encoding = <span class="string">"utf-8"</span></span><br><span class="line">html = res.text</span><br><span class="line">print(html)</span><br></pre></td></tr></table></figure>
<h3 id="私密代理"><a href="#私密代理" class="headerlink" title="私密代理"></a>私密代理</h3><figure class="highlight py"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> requests</span><br><span class="line"></span><br><span class="line"><span class="comment">#url = "http://httpbin.org/get"</span></span><br><span class="line">url = <span class="string">"http://www.baidu.com/"</span></span><br><span class="line">headers = &#123;<span class="string">"User-Agent"</span>:<span class="string">"Mozilla/5.0"</span>&#125;</span><br><span class="line">proxies = &#123;<span class="string">"http"</span>:<span class="string">"http://309435365:szayclhp@116.255.162.107:16816"</span>&#125;</span><br><span class="line"></span><br><span class="line">res = requests.get(url,proxies=proxies,</span><br><span class="line">                   headers=headers)</span><br><span class="line">res.encoding = <span class="string">"utf-8"</span></span><br><span class="line">print(res.text)</span><br></pre></td></tr></table></figure>
      
    </div>

    

    
    
    

    

    
      
    
    

    

    <footer class="post-footer">
      
        <div class="post-tags">
          
            <a href="/tags/爬虫/" rel="tag"># 爬虫</a>
          
        </div>
      

      
      
      

      
        <div class="post-nav">
          <div class="post-nav-next post-nav-item">
            
              <a href="/2018/08/08/crawler/数据持久化/" rel="next" title="爬虫-数据持久化">
                <i class="fa fa-chevron-left"></i> 爬虫-数据持久化
              </a>
            
          </div>

          <span class="post-nav-divider"></span>

          <div class="post-nav-prev post-nav-item">
            
              <a href="/2018/08/08/crawler/urllib/" rel="prev" title="爬虫-urllib">
                爬虫-urllib <i class="fa fa-chevron-right"></i>
              </a>
            
          </div>
        </div>
      

      
      
    </footer>
  </div>
  
  
  
  </article>


  </div>


          </div>
          

  



        </div>
        
          
  
  <div class="sidebar-toggle">
    <div class="sidebar-toggle-line-wrap">
      <span class="sidebar-toggle-line sidebar-toggle-line-first"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-middle"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-last"></span>
    </div>
  </div>

  <aside id="sidebar" class="sidebar">
    <div class="sidebar-inner">

      

      
        <ul class="sidebar-nav motion-element">
          <li class="sidebar-nav-toc sidebar-nav-active" data-target="post-toc-wrap">
            文章目录
          </li>
          <li class="sidebar-nav-overview" data-target="site-overview-wrap">
            站点概览
          </li>
        </ul>
      

      <div class="site-overview-wrap sidebar-panel">
        <div class="site-overview">
          <div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
            
              <img class="site-author-image" itemprop="image" src="/images/avatar.JPG" alt="任成">
            
              <p class="site-author-name" itemprop="name">任成</p>
              <div class="site-description motion-element" itemprop="description"></div>
          </div>

          

          
            <div class="feed-link motion-element">
              <a href="/atom.xml" rel="alternate">
                <i class="fa fa-rss"></i>
                RSS
              </a>
            </div>
          

          

          
            <div class="links-of-author motion-element">
              
                <span class="links-of-author-item">
                  
                  
                    
                  
                  
                    
                  
                  <span class="exturl" data-url="aHR0cHM6Ly9naXRodWIuY29tL0NvZGVyQ3VydmU=" title="GitHub &rarr; https://github.com/CoderCurve"><i class="fa fa-fw fa-github"></i>GitHub</span>
                </span>
              
                <span class="links-of-author-item">
                  
                  
                    
                  
                  
                    
                  
                  <span class="exturl" data-url="bWFpbHRvOnJlbmNoZW5nMTFAaWNsb3VkLmNvbQ==" title="E-Mail &rarr; mailto:rencheng11@icloud.com"><i class="fa fa-fw fa-envelope"></i>E-Mail</span>
                </span>
              
            </div>
          

          

          
          

          
            
          
          

        </div>
      </div>

      
      <!--noindex-->
        <div class="post-toc-wrap motion-element sidebar-panel sidebar-panel-active">
          <div class="post-toc">

            
            
            
            

            
              <div class="post-toc-content"><ol class="nav"><li class="nav-item nav-level-1"><a class="nav-link" href="#简单实用"><span class="nav-number">1.</span> <span class="nav-text">简单实用</span></a></li><li class="nav-item nav-level-1"><a class="nav-link" href="#进阶指南"><span class="nav-number">2.</span> <span class="nav-text">进阶指南</span></a></li><li class="nav-item nav-level-1"><a class="nav-link" href="#有道翻译案例"><span class="nav-number">3.</span> <span class="nav-text">有道翻译案例</span></a></li><li class="nav-item nav-level-1"><a class="nav-link" href="#附录一"><span class="nav-number">4.</span> <span class="nav-text">附录一</span></a><ol class="nav-child"><li class="nav-item nav-level-2"><a class="nav-link" href="#查询本机公网IP"><span class="nav-number">4.1.</span> <span class="nav-text">查询本机公网IP</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#代理参数-：proxies-—-gt-字典"><span class="nav-number">4.2.</span> <span class="nav-text">代理参数 ：proxies —&gt; 字典</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#示例"><span class="nav-number">4.3.</span> <span class="nav-text">示例</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#普通代理"><span class="nav-number">4.3.1.</span> <span class="nav-text">普通代理</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#私密代理"><span class="nav-number">4.3.2.</span> <span class="nav-text">私密代理</span></a></li></ol></li></ol></li></ol></div>
            

          </div>
        </div>
      <!--/noindex-->
      

      

    </div>
  </aside>
  


        
      </div>
    </main>

    <footer id="footer" class="footer">
      <div class="footer-inner">
        <div class="copyright">  <span class="exturl" data-url="aHR0cDovL3d3dy5iZWlhbi5taWl0Lmdvdi5jbg==">京ICP备18001214号 </span>&copy; 2014 – <span itemprop="copyrightYear">2019</span>
  <span class="with-love" id="animate">
    <i class="fa fa-user"></i>
  </span>
  <span class="author" itemprop="copyrightHolder">任成</span>

  
    <span class="post-meta-divider">|</span>
    <span class="post-meta-item-icon">
      <i class="fa fa-area-chart"></i>
    </span>
    
    <span title="站点总字数">277k</span>
  

  
</div>









        








        
      </div>
    </footer>

    
      <div class="back-to-top">
        <i class="fa fa-arrow-up"></i>
        
      </div>
    

    

    

    
  </div>

  

<script>
  // Reset window.Promise to null unless its type tag is '[object Function]'.
  // NOTE(review): presumably lets later scripts treat a non-function Promise as absent — confirm.
  var promiseTypeTag = Object.prototype.toString.call(window.Promise);
  if (promiseTypeTag !== '[object Function]') {
    window.Promise = null;
  }
</script>


























  
  <script src="/lib/jquery/index.js?v=2.1.3"></script>

  
  <script src="/lib/velocity/velocity.min.js?v=1.2.1"></script>

  
  <script src="/lib/velocity/velocity.ui.min.js?v=1.2.1"></script>


  


  <script src="/js/utils.js?v=7.1.2"></script>

  <script src="/js/motion.js?v=7.1.2"></script>



  
  


  <script src="/js/affix.js?v=7.1.2"></script>

  <script src="/js/schemes/pisces.js?v=7.1.2"></script>



  
  <script src="/js/scrollspy.js?v=7.1.2"></script>
<script src="/js/post-details.js?v=7.1.2"></script>



  


  <script src="/js/next-boot.js?v=7.1.2"></script>


  

  
  <script src="/js/exturl.js?v=7.1.2"></script>


  

  


  


  
  <script>
    // Shared state for the local-search popup.
    // isfetched starts false; the ajax success handler in searchFunc sets it to true.
    var isfetched = false;

    // Location of the search database, relative to the site root.
    var search_path = "search.xml";
    if (search_path.length === 0) {
      // An empty path falls back to the default file name.
      search_path = "search.xml";
    }

    // The database is parsed as XML unless the path ends in "json" (case-insensitive).
    var isXml = !/json$/i.test(search_path);
    var path = "/" + search_path;
    // monitor main search box;

    // Close handler for the search popup: hides every .popup element, empties the
    // query box, removes rendered results and the overlay, and clears the inline
    // overflow style that was set on <body>.
    var onPopupClose = function (e) {
      $('.popup').hide();
      $('#local-search-input').val('');

      // Removed in the same order as before: result list, #no-result element, overlay.
      ['.search-result-list', '#no-result', ".local-search-pop-overlay"].forEach(function (selector) {
        $(selector).remove();
      });

      $('body').css('overflow', '');
    };

    // Open the search popup: append an overlay to <body>, set the body's
    // overflow to hidden, make overlay clicks close the popup, toggle the
    // popup's visibility, and focus the query input with auto-capitalisation
    // and auto-correction switched off.
    function proceedsearch() {
      var overlayMarkup = '<div class="search-popup-overlay local-search-pop-overlay"></div>';
      var $body = $("body");
      $body.append(overlayMarkup);
      $body.css('overflow', 'hidden');
      $('.search-popup-overlay').click(onPopupClose);
      $('.popup').toggle();
      $('#local-search-input')
        .attr({ autocapitalize: "none", autocorrect: "off" })
        .focus();
    }

    // Download the search database and wire up local, in-browser searching.
    //
    // Shows a loading overlay, requests `path` (parsed as XML or JSON depending
    // on the global `isXml`), then attaches the search handler to the input and
    // opens the popup through proceedsearch(). If the request fails, the overlay
    // is removed and the body's overflow style is cleared; `isfetched` stays
    // false, so the next click on the trigger retries the download.
    //
    //   path       - URL of the search database.
    //   search_id  - id of the text input that holds the query.
    //   content_id - id of the element that receives the rendered results.
    var searchFunc = function(path, search_id, content_id) {
      'use strict';

      // Return one {position, word} record for every non-overlapping occurrence
      // of `word` in `text`; an empty `word` yields no records. Unless
      // `caseSensitive` is truthy, both strings are lower-cased before matching.
      function getIndexByWord(word, text, caseSensitive) {
        var wordLen = word.length;
        if (wordLen === 0) {
          return [];
        }
        var startPosition = 0, position, index = [];
        if (!caseSensitive) {
          text = text.toLowerCase();
          word = word.toLowerCase();
        }
        while ((position = text.indexOf(word, startPosition)) > -1) {
          index.push({position: position, word: word});
          startPosition = position + wordLen;
        }
        return index;
      }

      // Build the HTML for text[slice.start, slice.end), wrapping every hit of
      // the slice in <b class="search-keyword">.
      // NOTE(review): `text` is concatenated into HTML without escaping; this
      // presumes the search database only contains the site's own content.
      function highlightKeyword(text, slice) {
        var result = '';
        var prevEnd = slice.start;
        slice.hits.forEach(function (hit) {
          result += text.substring(prevEnd, hit.position);
          var end = hit.position + hit.length;
          result += '<b class="search-keyword">' + text.substring(hit.position, end) + '</b>';
          prevEnd = end;
        });
        result += text.substring(prevEnd, slice.end);
        return result;
      }

      // Remove the loading overlay and clear the overflow style set on <body>.
      function removeLoadingOverlay() {
        $(".local-search-pop-overlay").remove();
        $('body').css('overflow', '');
      }

      // start loading animation
      $("body")
        .append('<div class="search-popup-overlay local-search-pop-overlay">' +
          '<div id="search-loading-icon">' +
          '<i class="fa fa-spinner fa-pulse fa-5x fa-fw"></i>' +
          '</div>' +
          '</div>')
        .css('overflow', 'hidden');
      $("#search-loading-icon").css('margin', '20% auto 0 auto').css('text-align', 'center');

      $.ajax({
        url: path,
        dataType: isXml ? "xml" : "json",
        async: true,
        error: function(jqXHR, textStatus, errorThrown) {
          // Without this the spinner overlay (which has no close handler) would
          // stay on screen and the body would remain at overflow: hidden.
          removeLoadingOverlay();
          if (window.console && console.error) {
            console.error('Failed to load search data from ' + path + ': ' + textStatus, errorThrown);
          }
        },
        success: function(res) {
          // get the contents from search data
          isfetched = true;
          $('.popup').detach().appendTo('.header-inner');
          var datas = isXml ? $("entry", res).map(function() {
            return {
              title: $("title", this).text(),
              content: $("content",this).text(),
              url: $("url" , this).text()
            };
          }).get() : res;
          var input = document.getElementById(search_id);
          var resultContent = document.getElementById(content_id);

          // Run the query currently in the input against `datas` and render the
          // outcome into `resultContent`.
          var inputEventFunction = function() {
            var searchText = input.value.trim().toLowerCase();
            // Split on whitespace and hyphens; for a multi-word query the whole
            // phrase is searched as an additional keyword.
            var keywords = searchText.split(/[\s\-]+/);
            if (keywords.length > 1) {
              keywords.push(searchText);
            }
            var resultItems = [];
            if (searchText.length > 0) {
              // perform local searching
              datas.forEach(function(data) {
                var isMatch = false;
                var hitCount = 0;
                var searchTextCount = 0;
                var title = data.title.trim();
                var titleInLowerCase = title.toLowerCase();
                // strip markup tags from the content before matching
                var content = data.content.trim().replace(/<[^>]+>/g,"");
                var contentInLowerCase = content.toLowerCase();
                var articleUrl = decodeURIComponent(data.url).replace(/\/{2,}/g, '/');
                var indexOfTitle = [];
                var indexOfContent = [];

                // Merge hits into one slice covering [start, end). Hits are taken
                // from the END of `index` (sorted below so the smallest position
                // is last); a hit that starts before the previously taken hit
                // ends is dropped. Hits equal to the full query are counted and
                // added to this article's `searchTextCount`.
                // Declared at function level because ES5 strict mode does not
                // allow function declarations nested inside blocks.
                function mergeIntoSlice(text, start, end, index) {
                  var item = index[index.length - 1];
                  var position = item.position;
                  var word = item.word;
                  var hits = [];
                  var searchTextCountInSlice = 0;
                  while (position + word.length <= end && index.length != 0) {
                    if (word === searchText) {
                      searchTextCountInSlice++;
                    }
                    hits.push({position: position, length: word.length});
                    var wordEnd = position + word.length;

                    // move to next position of hit
                    index.pop();
                    while (index.length != 0) {
                      item = index[index.length - 1];
                      position = item.position;
                      word = item.word;
                      if (wordEnd > position) {
                        index.pop();
                      } else {
                        break;
                      }
                    }
                  }
                  searchTextCount += searchTextCountInSlice;
                  return {
                    hits: hits,
                    start: start,
                    end: end,
                    searchTextCount: searchTextCountInSlice
                  };
                }

                // only match articles with not empty titles
                if(title != '') {
                  keywords.forEach(function(keyword) {
                    indexOfTitle = indexOfTitle.concat(getIndexByWord(keyword, titleInLowerCase, false));
                    indexOfContent = indexOfContent.concat(getIndexByWord(keyword, contentInLowerCase, false));
                  });
                  if (indexOfTitle.length > 0 || indexOfContent.length > 0) {
                    isMatch = true;
                    hitCount = indexOfTitle.length + indexOfContent.length;
                  }
                }

                // show search results

                if (isMatch) {
                  // Sort hits by descending position so pop() yields the earliest
                  // hit; at equal positions the longer word ends up last and is
                  // therefore popped first.
                  [indexOfTitle, indexOfContent].forEach(function (index) {
                    index.sort(function (itemLeft, itemRight) {
                      if (itemRight.position !== itemLeft.position) {
                        return itemRight.position - itemLeft.position;
                      } else {
                        return itemLeft.word.length - itemRight.word.length;
                      }
                    });
                  });

                  // merge hits into slices

                  var slicesOfTitle = [];
                  if (indexOfTitle.length != 0) {
                    slicesOfTitle.push(mergeIntoSlice(title, 0, title.length, indexOfTitle));
                  }

                  var slicesOfContent = [];
                  while (indexOfContent.length != 0) {
                    var item = indexOfContent[indexOfContent.length - 1];
                    var position = item.position;
                    var word = item.word;
                    // cut out 100 characters: 20 before the hit and 80 from it,
                    // clamped to the content and widened to hold the whole word
                    var start = position - 20;
                    var end = position + 80;
                    if(start < 0){
                      start = 0;
                    }
                    if (end < position + word.length) {
                      end = position + word.length;
                    }
                    if(end > content.length){
                      end = content.length;
                    }
                    slicesOfContent.push(mergeIntoSlice(content, start, end, indexOfContent));
                  }

                  // sort slices in content by search text's count and hits' count

                  slicesOfContent.sort(function (sliceLeft, sliceRight) {
                    if (sliceLeft.searchTextCount !== sliceRight.searchTextCount) {
                      return sliceRight.searchTextCount - sliceLeft.searchTextCount;
                    } else if (sliceLeft.hits.length !== sliceRight.hits.length) {
                      return sliceRight.hits.length - sliceLeft.hits.length;
                    } else {
                      return sliceLeft.start - sliceRight.start;
                    }
                  });

                  // select top N slices in content

                  var upperBound = parseInt('1', 10);
                  if (upperBound >= 0) {
                    slicesOfContent = slicesOfContent.slice(0, upperBound);
                  }

                  // highlight title and content

                  var resultItem = '';

                  if (slicesOfTitle.length != 0) {
                    resultItem += "<li><a href='" + articleUrl + "' class='search-result-title'>" + highlightKeyword(title, slicesOfTitle[0]) + "</a>";
                  } else {
                    resultItem += "<li><a href='" + articleUrl + "' class='search-result-title'>" + title + "</a>";
                  }

                  slicesOfContent.forEach(function (slice) {
                    resultItem += "<a href='" + articleUrl + "'>" +
                      "<p class=\"search-result\">" + highlightKeyword(content, slice) +
                      "...</p>" + "</a>";
                  });

                  resultItem += "</li>";
                  resultItems.push({
                    item: resultItem,
                    searchTextCount: searchTextCount,
                    hitCount: hitCount,
                    id: resultItems.length
                  });
                }
              });
            }
            if (keywords.length === 1 && keywords[0] === "") {
              // empty query: show the search placeholder icon
              resultContent.innerHTML = '<div id="no-result"><i class="fa fa-search fa-5x"></i></div>';
            } else if (resultItems.length === 0) {
              // nothing matched: show the "frown" icon
              resultContent.innerHTML = '<div id="no-result"><i class="fa fa-frown-o fa-5x"></i></div>';
            } else {
              // Order by full-query matches, then total hits, then by id
              // (articles pushed later come first).
              resultItems.sort(function (resultLeft, resultRight) {
                if (resultLeft.searchTextCount !== resultRight.searchTextCount) {
                  return resultRight.searchTextCount - resultLeft.searchTextCount;
                } else if (resultLeft.hitCount !== resultRight.hitCount) {
                  return resultRight.hitCount - resultLeft.hitCount;
                } else {
                  return resultRight.id - resultLeft.id;
                }
              });
              var searchResultList = '<ul class=\"search-result-list\">';
              resultItems.forEach(function (result) {
                searchResultList += result.item;
              });
              searchResultList += "</ul>";
              resultContent.innerHTML = searchResultList;
            }
          };

          // NOTE(review): comparison of two literals, presumably emitted by the
          // site generator from a "search trigger" setting; both branches are
          // kept so the generated shape is unchanged.
          if ('auto' === 'auto') {
            input.addEventListener('input', inputEventFunction);
          } else {
            $('.search-icon').click(inputEventFunction);
            input.addEventListener('keypress', function (event) {
              if (event.keyCode === 13) {
                inputEventFunction();
              }
            });
          }

          // remove loading animation
          removeLoadingOverlay();

          proceedsearch();
        }
      });
    };

    // Trigger click: download the search database on first use; once it has
    // been fetched, just reopen the popup.
    $('.popup-trigger').on('click', function (evt) {
      evt.stopPropagation();
      if (isfetched !== false) {
        proceedsearch();
        return;
      }
      searchFunc(path, 'local-search-input', 'local-search-result');
    });

    // The close button dismisses the popup.
    $('.popup-btn-close').on('click', onPopupClose);

    // Clicks inside the popup do not propagate to ancestor elements.
    $('.popup').on('click', function (evt) {
      evt.stopPropagation();
    });

    // Releasing Escape (key code 27) dismisses the popup while it is visible.
    $(document).on('keyup', function (evt) {
      if (evt.which !== 27) {
        return;
      }
      if ($('.search-popup').is(':visible')) {
        onPopupClose();
      }
    });
  </script>





  

  

  
  

  
  

  


  

  
<script>
// Load Mermaid from the CDN only when the page contains at least one
// <pre class="mermaid"> element, then initialise it with the site's settings.
if ($('body').find('pre.mermaid').length) {
  $.ajax({
    type: 'GET',
    // Explicit https: a protocol-relative URL would inherit the page's scheme,
    // i.e. plain http on an http page and an unusable file:// URL when the
    // page is opened from disk.
    url: 'https://cdn.jsdelivr.net/npm/mermaid@8/dist/mermaid.min.js',
    dataType: 'script',
    // jQuery adds a cache-busting query parameter to script requests unless
    // caching is enabled explicitly.
    cache: true,
    success: function() {
      mermaid.initialize({
        theme: 'forest',
        logLevel: 3,
        flowchart: { curve: 'linear' },
        gantt: { axisFormat: '%m/%d/%Y' },
        sequence: { actorMargin: 50 }
      });
    }
  });
}
</script>


  

  

  

  

  

  

  

  

  

</body>
</html>
